{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "05b6ca26",
   "metadata": {},
   "source": [
    "### Process STAR Fusion Max Sensitivity Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b125a8d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "from pathlib import Path "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1f20c61c",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/\"\n",
    "wkdir_path = Path(wkdir)\n",
    "misexp_vrnt_feat_path = wkdir_path.joinpath(\"6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv\")\n",
    "misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep=\"\\t\")\n",
    "star_fusion_max_sensitivity = wkdir_path.joinpath(\"6_misexp_dissect/star_fusion/results_max_sensitivity\")\n",
    "star_fusion_max_sensitivity_path = Path(star_fusion_max_sensitivity)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5a451061",
   "metadata": {},
   "outputs": [],
   "source": [
    "misexp_tx_fusion_df_list = []\n",
    "for index,row in misexp_vrnt_feat_df.iterrows():\n",
    "    vrnt_id = row[\"vrnt_id\"]\n",
    "    misexp_gene_id = row[\"gene_id\"]\n",
    "    rna_id = row[\"rna_id\"]\n",
    "    # load STAR fusion predictions \n",
    "    star_fusion_predict_path = star_fusion_max_sensitivity_path.joinpath(f\"{rna_id}/star-fusion.fusion_predictions.abridged.tsv\")\n",
    "    star_fusion_predict_df = pd.read_csv(star_fusion_predict_path, sep=\"\\t\")\n",
    "    # check if misexpressed gene in LeftGene\n",
    "    if not star_fusion_predict_df[star_fusion_predict_df.LeftGene.str.contains(misexp_gene_id)].empty:\n",
    "        misexp_fusion_df = star_fusion_predict_df[star_fusion_predict_df.LeftGene.str.contains(misexp_gene_id)].copy()\n",
    "        misexp_fusion_df[\"rna_id\"] = rna_id \n",
    "        misexp_fusion_df[\"vrnt_id\"] = vrnt_id \n",
    "        misexp_fusion_df[\"gene_id\"] = misexp_gene_id \n",
    "        misexp_tx_fusion_df_list.append(misexp_fusion_df)\n",
    "    # check if misexpressed gene in RightGene\n",
    "    elif not star_fusion_predict_df[star_fusion_predict_df.RightGene.str.contains(misexp_gene_id)].empty:\n",
    "        misexp_fusion_df = star_fusion_predict_df[star_fusion_predict_df.RightGene.str.contains(misexp_gene_id)].copy()\n",
    "        misexp_fusion_df[\"rna_id\"] = rna_id \n",
    "        misexp_fusion_df[\"vrnt_id\"] = vrnt_id \n",
    "        misexp_fusion_df[\"gene_id\"] = misexp_gene_id \n",
    "        misexp_tx_fusion_df_list.append(misexp_fusion_df)\n",
    "    else:\n",
    "        continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "61b43eb2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of different fusion gene pairs: 15\n",
      "Number of different types of fusion event: 21\n"
     ]
    }
   ],
   "source": [
    "misexp_tx_fusion_df = pd.concat(misexp_tx_fusion_df_list).reset_index(drop=True)\n",
    "fusion_names = misexp_tx_fusion_df['#FusionName'].unique()\n",
    "print(f\"Number of different fusion gene pairs: {len(fusion_names)}\")\n",
    "\n",
    "# some fusion events have the same name but involve different breakpoints \n",
    "misexp_tx_fusion_df[\"fusion_id\"] = misexp_tx_fusion_df[\"#FusionName\"] + \"|\" + misexp_tx_fusion_df[\"LeftBreakpoint\"] + \"|\" + misexp_tx_fusion_df[\"RightBreakpoint\"]\n",
    "fusion_ids = misexp_tx_fusion_df.fusion_id.unique()\n",
    "print(f\"Number of different types of fusion event: {len(fusion_ids)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ef4f23da",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of misexpression-associated SVs in cis to fusion event: 24\n",
      "Number of misexpression-associated SVs with consistent fusion events: 22\n"
     ]
    }
   ],
   "source": [
    "# check that fusion events occur in all samples carrying variant \n",
    "vrnts_consistent_fusion = []\n",
    "misexp_tx_fusion_vrnt_ids = misexp_tx_fusion_df.vrnt_id.unique()\n",
    "print(f\"Number of misexpression-associated SVs in cis to fusion event: {len(misexp_tx_fusion_vrnt_ids)}\")\n",
    "for vrnt_id in misexp_tx_fusion_vrnt_ids: \n",
    "    fusion_rna_ids = set(misexp_tx_fusion_df[misexp_tx_fusion_df.vrnt_id == vrnt_id].rna_id.unique())\n",
    "    misexp_rna_ids = set(misexp_vrnt_feat_df[misexp_vrnt_feat_df.vrnt_id == vrnt_id].rna_id.unique())\n",
    "    if fusion_rna_ids == misexp_rna_ids: \n",
    "        vrnts_consistent_fusion.append(vrnt_id)\n",
    "print(f\"Number of misexpression-associated SVs with consistent fusion events: {len(vrnts_consistent_fusion)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0400cc70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# select fusion events that are supported by all variant carriers \n",
    "fusion_vrnt_carriers_df = misexp_tx_fusion_df[[\"vrnt_id\", \"rna_id\", \"fusion_id\", \"LeftBreakpoint\", \"RightBreakpoint\"]].drop_duplicates()\n",
    "# count number of samples fusion is observed in  \n",
    "fusion_vrnt_carriers_count_df = fusion_vrnt_carriers_df.groupby([\"vrnt_id\", \"fusion_id\", \"LeftBreakpoint\", \"RightBreakpoint\"], as_index=False).rna_id.count().rename(columns={\"rna_id\": \"carrier_count_fusion\"})\n",
    "# count carriers \n",
    "misexp_vrnt_carrier_df = misexp_vrnt_feat_df[[\"vrnt_id\", \"rna_id\"]].drop_duplicates()\n",
    "misexp_vrnt_carrier_count_df = misexp_vrnt_carrier_df.groupby(\"vrnt_id\", as_index=False).rna_id.nunique().rename(columns={\"rna_id\": \"carrier_count_total\"})\n",
    "misexp_fusion_vrnt_carrier_df = pd.merge(misexp_vrnt_carrier_count_df, \n",
    "                                         fusion_vrnt_carriers_count_df, \n",
    "                                         on=\"vrnt_id\", how=\"inner\")\n",
    "# restrict to fusion events observed across all carriers \n",
    "misexp_fusion_vrnt_consistent_df = misexp_fusion_vrnt_carrier_df[misexp_fusion_vrnt_carrier_df.carrier_count_total == misexp_fusion_vrnt_carrier_df.carrier_count_fusion].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fdc3a12b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of variant IDs where all carriers have a low evidence fusion: 22\n",
      "Number of fusion IDs where all carriers have a low evidence fusion: 17\n"
     ]
    }
   ],
   "source": [
    "vrnt_id_consitent_fusion = misexp_fusion_vrnt_consistent_df.vrnt_id.unique()\n",
    "print(f\"Number of variant IDs where all carriers have a low evidence fusion: {len(vrnt_id_consitent_fusion)}\")\n",
    "fusion_id_consistent = misexp_fusion_vrnt_consistent_df.fusion_id.unique()\n",
    "print(f\"Number of fusion IDs where all carriers have a low evidence fusion: {len(fusion_id_consistent)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "02472b8c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of fusion genes involving a misexpressed gene: 12\n",
      "Number of fusion transcripts: 12\n"
     ]
    }
   ],
   "source": [
    "misexp_tx_fusion_consistent_df = misexp_tx_fusion_df[misexp_tx_fusion_df.fusion_id.isin(fusion_id_consistent)].reset_index(drop=True).copy()\n",
    "misexp_gene_tx_fusion_events_df = misexp_tx_fusion_consistent_df.groupby(\"gene_id\", as_index=False)[[\"rna_id\", \"fusion_id\", \"vrnt_id\"]].nunique()\n",
    "fusion_events_misexp_gene = misexp_gene_tx_fusion_events_df.gene_id.unique()\n",
    "print(f\"Number of fusion genes involving a misexpressed gene: {len(fusion_events_misexp_gene)}\")\n",
    "# Number of fusion \n",
    "low_evidence_fusion_transcripts = misexp_tx_fusion_consistent_df[\"#FusionName\"].unique()\n",
    "print(f\"Number of fusion transcripts: {len(low_evidence_fusion_transcripts)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c055e46b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of RNA IDs for stringent STAR fusion filtering: 14\n"
     ]
    }
   ],
   "source": [
    "# write RNA IDs to file for stringent filtering \n",
    "misexp_rna_id_fusion = misexp_tx_fusion_df.rna_id.unique()\n",
    "print(f\"Number of RNA IDs for stringent STAR fusion filtering: {len(misexp_rna_id_fusion)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f4934d2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "star_fusion_rna_ids_max_sensitivity_path = wkdir_path.joinpath(\"6_misexp_dissect/star_fusion/star_fusion_rna_ids_max_sensitivity.txt\")\n",
    "with open(star_fusion_rna_ids_max_sensitivity_path, \"w\") as f_out: \n",
    "    for rna_id in misexp_rna_id_fusion: \n",
    "        f_out.write(f\"{rna_id}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f954b861",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
